Project 1 (EAS 509: Fall’23)

Project Title: Clustering Analysis of the Land Mines Dataset

Team Members:

  1. Sujay Shrivastava (50496221) (sujayshr)
  2. Utkarsh Mathur (50495131) (umathur)
  3. Venkata Lakshmi Krishna Tejaswi Gudimetla (50496378) (vgudimet)

Project Dataset: Land Mines


Importing Libraries

# Visualization and Analysis
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggdendro)
library(readxl)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# readxl was already attached above; re-attaching is harmless but the
# suppressWarnings() wrapper hid any real startup warnings, so it is dropped.
library(readxl)

# Modeling and Inference
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: lattice
library(nnet)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-7
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Load the land-mines dataset from the local Excel file.
# Columns V, H, S appear to be normalized sensor readings and M the mine
# type (1-5) -- NOTE(review): assumed from usage below; confirm against
# the dataset's data dictionary.
df <- read_excel("mine_data.xlsx")
summary(df)
##        V                H                S                M        
##  Min.   :0.1977   Min.   :0.0000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.3097   1st Qu.:0.2727   1st Qu.:0.2000   1st Qu.:2.000  
##  Median :0.3595   Median :0.5455   Median :0.6000   Median :3.000  
##  Mean   :0.4306   Mean   :0.5089   Mean   :0.5036   Mean   :2.953  
##  3rd Qu.:0.4826   3rd Qu.:0.7273   3rd Qu.:0.8000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :5.000
# Scatter plots of mine type M against each of the three sensor features.
ggplot(df, aes(x = V, y = M)) +
  geom_point(color = "blue") +
  theme_bw()

ggplot(df, aes(x = H, y = M)) +
  geom_point(color = "red") +
  theme_bw()

ggplot(df, aes(x = S, y = M)) +
  geom_point(color = "blue") +
  theme_bw()

K-Means Clustering

# K-means on the three sensor features; M (mine type) is held out as the
# ground-truth label for the cross-tabulation below.
features <- df[, c("V", "H", "S")]
labels <- df[["M"]]  # [[ extracts a vector; [, c("M")] returned a 1-col tibble
set.seed(42)  # kmeans uses random starts; fix the seed for reproducibility
kmeans_model <- kmeans(features, centers = 5, nstart = 20)
df$clusters <- kmeans_model$cluster
# Rows = cluster id (1-5), columns = true mine type M (1-5).
confusion_matrix <- table(df$clusters, df$M)
print(confusion_matrix)
##    
##      1  2  3  4  5
##   1  0 30  1  0  0
##   2 17 15 18 16 18
##   3 18 13 16 15 13
##   4 18  0 14 17 17
##   5 18 12 17 18 17
# 3-D scatter of the three features colored by the true mine type M.
plot_ly(x=df$V, y=df$H, z=df$S, color=df$M, type="scatter3d")
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
# Same 3-D scatter, now colored by the k = 5 k-means cluster assignment,
# for visual comparison against the true labels above.
plot_ly(x=df$V, y=df$H, z=df$S, color=df$clusters, type="scatter3d")
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Analysis of Number of Clusters

# Elbow analysis: total within-cluster sum of squares for k = 1..8.
k_values <- 1:8  # Adjust the range as needed
wss_values <- numeric(length(k_values))

set.seed(42)  # fix the seed so the elbow curve is reproducible
for (i in seq_along(k_values)) {
  # Use the named feature set (not positional df[, 1:3]) and nstart = 20 so
  # each WSS is not an artifact of a single unlucky random start. A local
  # variable avoids clobbering the k = 5 model fitted above.
  km <- kmeans(features, centers = k_values[i], nstart = 20)
  wss_values[i] <- km$tot.withinss  # same as sum(km$withinss)
}
elbow_plot <- ggplot(data.frame(k = k_values, wss = wss_values), aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  labs(title = "Elbow Plot for K-means Clustering",
       x = "Number of Clusters (k)",
       y = "Within-Cluster Sum of Squares (WCSS)")
ggplotly(elbow_plot)
# Refit with k = 4, the elbow suggested by the plot above.
new_kmeans_model <- kmeans(features, centers = 4, nstart = 20)
df$clusters2 <- new_kmeans_model$cluster
plot_ly(x = df$V, y = df$H, z = df$S, color = df$clusters2, type = "scatter3d")
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Hierarchical Clustering

# Hierarchical clustering under three linkage rules. The distance matrix is
# identical for all three, so compute it once instead of three times.
feature_dist <- dist(features)
h_clust.complete <- hclust(feature_dist, method = "complete")
h_clust.single <- hclust(feature_dist, method = "single")
h_clust.average <- hclust(feature_dist, method = "average")
ggplotly(ggdendrogram(h_clust.complete, rotate = FALSE, size = 2))
ggplotly(ggdendrogram(h_clust.single, rotate = FALSE, size = 2))
ggplotly(ggdendrogram(h_clust.average, rotate = FALSE, size = 2))

Classification Performances

set.seed(42)
# Random ~80/20 train/test split via a row-wise Bernoulli draw; the split
# sizes are approximate, not exact.
sample1 <- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.8,0.2))
train1 <- df[sample1,]
test1 <- df[!sample1,]

# Treat the mine type as a categorical outcome for the classifiers below.
train1$M <- factor(train1$M)
test1$M <- factor(test1$M)

Logistic Regression

# Fitting a logistic regression model
# (multinomial, 5-class, via nnet::multinom) of mine type on the three
# sensor features.
lr_model <- multinom(M ~ V + H + S, data=train1)
## # weights:  25 (16 variable)
## initial  value 436.157674 
## iter  10 value 337.522543
## iter  20 value 280.974520
## iter  30 value 280.278933
## iter  40 value 280.272250
## iter  50 value 280.271867
## final  value 280.271806 
## converged
# Print the summary of the model
print(summary(lr_model))
## Call:
## multinom(formula = M ~ V + H + S, data = train1)
## 
## Coefficients:
##   (Intercept)        V         H           S
## 2  -33.320353 64.78419 14.074729 -3.43616821
## 3  -11.744900 28.89174  4.209780 -0.77318354
## 4   -4.939808 14.50535  1.366517 -0.53371800
## 5   -9.471749 23.92129  2.916064 -0.03414481
## 
## Std. Errors:
##   (Intercept)        V         H         S
## 2    4.882370 8.977899 2.4262340 1.4245505
## 3    1.875182 4.471560 0.9939078 0.6889898
## 4    1.395354 3.751771 0.7654476 0.6029561
## 5    1.675540 4.167169 0.8911530 0.6572570
## 
## Residual Deviance: 560.5436 
## AIC: 592.5436
# Predict on the test set
lr_pred <- predict(lr_model, test1)

# Model diagnostics. In the table, rows are true labels and columns are
# predictions, so:
#   accuracy  = correct predictions / total predictions
#   precision = per-class TP / predicted-in-class (column sums)
#   recall    = per-class TP / actually-in-class (row sums)
# The original mean(diag(...)) averaged raw diagonal counts (yielding a
# value > 1, not an accuracy), and caret's precision()/recall() return NA
# when given a multi-class table.
confusion_matrix_1 <- table(test1$M, lr_pred)

accuracy <- sum(diag(confusion_matrix_1)) / sum(confusion_matrix_1)
precision <- diag(confusion_matrix_1) / colSums(confusion_matrix_1)
recall <- diag(confusion_matrix_1) / rowSums(confusion_matrix_1)

print(accuracy)
## [1] 6.2
print(precision)
## [1] NA
print(recall)
## [1] NA
print(confusion_matrix_1)
##    lr_pred
##      1  2  3  4  5
##   1 15  0  1  4  1
##   2  0  9  1  0  0
##   3  0  1  4  3  6
##   4  3  0  1  0  6
##   5  1  0  6  2  3
# Heatmap of the logistic-regression confusion matrix over classes 1-5.
class_ids <- c(1, 2, 3, 4, 5)
plot_ly(
  x = class_ids,
  y = class_ids,
  z = confusion_matrix_1,
  type = "heatmap",
  colorscale = "Greys"
)

Linear Support Vector Machine

# Fitting a SVM model
# (e1071::svm, linear kernel, default cost = 1) on the same three features.
svm_model <- svm(M ~ V + H + S, data=train1, kernel="linear")

# Print the summary of the model
print(summary(svm_model))
## 
## Call:
## svm(formula = M ~ V + H + S, data = train1, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  223
## 
##  ( 44 18 52 56 53 )
## 
## 
## Number of Classes:  5 
## 
## Levels: 
##  1 2 3 4 5
# Predict on the test set
svm_pred <- predict(svm_model, test1)

# Model diagnostics: rows of the table are true labels, columns predictions.
confusion_matrix_2 <- table(test1$M, svm_pred)

# Overall accuracy plus per-class precision (TP / column sum) and recall
# (TP / row sum). mean(diag(...)) averaged raw counts (not an accuracy),
# and caret's precision()/recall() return NA on a multi-class table.
accuracy <- sum(diag(confusion_matrix_2)) / sum(confusion_matrix_2)
precision <- diag(confusion_matrix_2) / colSums(confusion_matrix_2)
recall <- diag(confusion_matrix_2) / rowSums(confusion_matrix_2)

print(accuracy)
## [1] 6.4
print(precision)
## [1] NA
print(recall)
## [1] NA
print(confusion_matrix_2)
##    svm_pred
##      1  2  3  4  5
##   1 15  0  2  2  2
##   2  0  9  1  0  0
##   3  0  1  4  3  6
##   4  3  0  2  0  5
##   5  1  0  6  1  4
# Heatmap of the linear-SVM confusion matrix over classes 1-5.
class_ids <- c(1, 2, 3, 4, 5)
plot_ly(
  x = class_ids,
  y = class_ids,
  z = confusion_matrix_2,
  type = "heatmap",
  colorscale = "Greys"
)

Radial Support Vector Machine

# Fitting a SVM model
# with a radial (RBF) kernel. NOTE(review): this overwrites the linear-kernel
# svm_model fitted earlier; rename one of them if both fits are needed later.
svm_model <- svm(M ~ V + H + S, data=train1, kernel="radial")

# Print the summary of the model
print(summary(svm_model))
## 
## Call:
## svm(formula = M ~ V + H + S, data = train1, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  239
## 
##  ( 47 31 52 56 53 )
## 
## 
## Number of Classes:  5 
## 
## Levels: 
##  1 2 3 4 5
# Predict on the test set
svm_pred <- predict(svm_model, test1)

# Model diagnostics: rows of the table are true labels, columns predictions.
confusion_matrix_3 <- table(test1$M, svm_pred)

# Overall accuracy plus per-class precision (TP / column sum) and recall
# (TP / row sum). mean(diag(...)) averaged raw counts (not an accuracy),
# and caret's precision()/recall() return NA on a multi-class table.
accuracy <- sum(diag(confusion_matrix_3)) / sum(confusion_matrix_3)
precision <- diag(confusion_matrix_3) / colSums(confusion_matrix_3)
recall <- diag(confusion_matrix_3) / rowSums(confusion_matrix_3)

print(accuracy)
## [1] 4.8
print(precision)
## [1] NA
print(recall)
## [1] NA
print(confusion_matrix_3)
##    svm_pred
##     1 2 3 4 5
##   1 6 0 3 8 4
##   2 0 8 1 1 0
##   3 0 1 4 2 7
##   4 2 0 0 2 6
##   5 1 0 1 6 4
# Heatmap of the radial-SVM confusion matrix over classes 1-5.
class_ids <- c(1, 2, 3, 4, 5)
plot_ly(
  x = class_ids,
  y = class_ids,
  z = confusion_matrix_3,
  type = "heatmap",
  colorscale = "Greys"
)